{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solving classification problems with CatBoost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/events/intel_hands_on_moscow_oct_11_2019.ipynb)\n",
    "\n",
    "In this tutorial we will use dataset Amazon Employee Access Challenge from [Kaggle](https://www.kaggle.com) competition for our experiments. Data can be downloaded [here](https://www.kaggle.com/c/amazon-employee-access-challenge/data)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.set_printoptions(precision=4)\n",
    "\n",
    "import catboost\n",
    "print(catboost.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.datasets import amazon\n",
    "\n",
    "(train_df, test_df) = amazon()\n",
    "\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "## Preparing the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Label values extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = train_df.ACTION\n",
    "X = train_df.drop('ACTION', axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Categorical features declaration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features = list(range(0, X.shape[1]))\n",
    "print(cat_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Looking on label balance in dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Labels: {}'.format(set(y)))\n",
    "print('Zero count = {}, One count = {}'.format(len(y) - sum(y), sum(y)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ways to create Pool class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_dir = './amazon'\n",
    "if not os.path.exists(dataset_dir):\n",
    "    os.makedirs(dataset_dir)\n",
    "\n",
    "train_df.to_csv(\n",
    "    os.path.join(dataset_dir, 'train.csv'),\n",
    "    index=False, sep=',', header=True\n",
    ")\n",
    "test_df.to_csv(\n",
    "    os.path.join(dataset_dir, 'test.csv'),\n",
    "    index=False, sep=',', header=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -3 amazon/train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.utils import create_cd\n",
    "feature_names = dict(list(enumerate(train_df.keys()))[1:])\n",
    "    \n",
    "create_cd(\n",
    "    label=0,\n",
    "    cat_features=list(range(1, train_df.shape[1])),\n",
    "    feature_names=feature_names,\n",
    "    output_path=os.path.join(dataset_dir, 'train.cd')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat amazon/train.cd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import Pool\n",
    "\n",
    "pool1 = Pool(data=X, label=y, cat_features=cat_features)\n",
    "\n",
    "pool2 = Pool(\n",
    "    data=os.path.join(dataset_dir, 'train.csv'), \n",
    "    delimiter=',', \n",
    "    column_description=os.path.join(dataset_dir, 'train.cd'),\n",
    "    has_header=True\n",
    ")\n",
    "\n",
    "print('Dataset shape: {}\\n'.format(pool1.shape))\n",
    "print('Column names: {}'.format(pool1.get_feature_names()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier\n",
    "\n",
    "CatBoostClassifier(iterations=3).fit(pool1)\n",
    "CatBoostClassifier(iterations=3).fit(pool2)\n",
    "CatBoostClassifier(iterations=3).fit(X, y, cat_features=cat_features);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split your data into train and validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
    "X_train, X_validation, y_train, y_validation = data\n",
    "\n",
    "train_pool = Pool(\n",
    "    data=X_train, \n",
    "    label=y_train, \n",
    "    cat_features=cat_features\n",
    ")\n",
    "\n",
    "validation_pool = Pool(\n",
    "    data=X_validation, \n",
    "    label=y_validation, \n",
    "    cat_features=cat_features\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=5,\n",
    "    learning_rate=0.1,\n",
    ")\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n",
    "\n",
    "print('Model is fitted: {}'.format(model.is_fitted()))\n",
    "print('Model params:\\n{}'.format(model.get_params()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stdout of the training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=15,\n",
    "#     verbose=5,\n",
    ")\n",
    "model.fit(train_pool, eval_set=validation_pool);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metrics calculation and graph plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=50,\n",
    "    learning_rate=0.5,\n",
    "    custom_loss=['AUC', 'Accuracy']\n",
    ")\n",
    "\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model1 = CatBoostClassifier(\n",
    "    learning_rate=0.7,\n",
    "    iterations=100,\n",
    "    train_dir='learing_rate_0.7'\n",
    ")\n",
    "\n",
    "model2 = CatBoostClassifier(\n",
    "    learning_rate=0.01,\n",
    "    iterations=100,\n",
    "    train_dir='learing_rate_0.01'\n",
    ")\n",
    "\n",
    "model1.fit(train_pool, eval_set=validation_pool, verbose=False)\n",
    "model2.fit(train_pool, eval_set=validation_pool, verbose=False);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import MetricVisualizer\n",
    "MetricVisualizer(['learing_rate_0.7', 'learing_rate_0.01']).start()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Best iteration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(\n",
    "    iterations=100,\n",
    "    learning_rate=0.5,\n",
    "#     use_best_model=False\n",
    ")\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Tree count: ' + str(model.tree_count_))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overfitting Detector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_with_early_stop = CatBoostClassifier(\n",
    "    iterations=200,\n",
    "    learning_rate=0.5,\n",
    "    early_stopping_rounds=20\n",
    ")\n",
    "\n",
    "model_with_early_stop.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(model_with_early_stop.tree_count_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Overfitting Detector with eval metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_with_early_stop = CatBoostClassifier(\n",
    "    eval_metric='AUC',\n",
    "    iterations=200,\n",
    "    learning_rate=0.5,\n",
    "    early_stopping_rounds=20\n",
    ")\n",
    "model_with_early_stop.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(model_with_early_stop.tree_count_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import cv\n",
    "\n",
    "params = {\n",
    "    'loss_function': 'Logloss',\n",
    "    'iterations': 80,\n",
    "    'custom_loss': 'AUC',\n",
    "    'learning_rate': 0.5,\n",
    "}\n",
    "\n",
    "cv_data = cv(\n",
    "    params = params,\n",
    "    pool = train_pool,\n",
    "    fold_count=5,\n",
    "    shuffle=True,\n",
    "    partition_random_seed=0,\n",
    "    plot=True,\n",
    "    stratified=True,\n",
    "    verbose=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_value = cv_data['test-Logloss-mean'].min()\n",
    "best_iter = cv_data['test-Logloss-mean'].values.argmin()\n",
    "\n",
    "print('Best validation Logloss score: {:.4f}±{:.4f} on step {}'.format(\n",
    "    best_value,\n",
    "    cv_data['test-Logloss-std'][best_iter],\n",
    "    best_iter)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(iterations=10, eval_metric='AUC')\n",
    "grid = {'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0]}\n",
    "result = model.grid_search(grid, train_pool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(result['params'])\n",
    "print(result['cv_results'].keys())\n",
    "print(result['cv_results']['test-AUC-mean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.grid_search(grid, train_pool, plot=True, verbose=False);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More about parameter tuning you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(iterations=200, learning_rate=0.03)\n",
    "\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(model.predict_proba(X_validation))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(model.predict(X_validation))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select decision boundary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://habrastorage.org/webt/y4/1q/yq/y41qyqfm9mcerp2ziys48phpjia.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from catboost.utils import get_roc_curve\n",
    "from catboost.utils import get_fpr_curve\n",
    "from catboost.utils import get_fnr_curve\n",
    "\n",
    "curve = get_roc_curve(model, validation_pool)\n",
    "(fpr, tpr, thresholds) = curve\n",
    "\n",
    "(thresholds, fpr) = get_fpr_curve(curve=curve)\n",
    "(thresholds, fnr) = get_fnr_curve(curve=curve)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(16, 8))\n",
    "style = {'alpha':0.5, 'lw':2}\n",
    "\n",
    "plt.plot(thresholds, fpr, color='blue', label='FPR', **style)\n",
    "plt.plot(thresholds, fnr, color='green', label='FNR', **style)\n",
    "\n",
    "plt.xlim([0.0, 1.0])\n",
    "plt.ylim([0.0, 1.05])\n",
    "plt.xticks(fontsize=16)\n",
    "plt.yticks(fontsize=16)\n",
    "plt.grid(True)\n",
    "plt.xlabel('Threshold', fontsize=16)\n",
    "plt.ylabel('Error Rate', fontsize=16)\n",
    "plt.title('FPR-FNR curves', fontsize=20)\n",
    "plt.legend(loc=\"lower left\", fontsize=16);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.utils import select_threshold\n",
    "\n",
    "print(select_threshold(model, validation_pool, FNR=0.01))\n",
    "print(select_threshold(model, validation_pool, FPR=0.01))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metric evaluation on a new dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "metrics = model.eval_metrics(\n",
    "    data=validation_pool,\n",
    "    metrics=['Logloss','AUC'],\n",
    "    ntree_start=0,\n",
    "    ntree_end=0,\n",
    "    eval_period=1,\n",
    "    plot=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('AUC values:\\n{}'.format(np.array(metrics['AUC'])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importances"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prediction values change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_feature_importance(prettified=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loss function change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_feature_importance(\n",
    "    data=validation_pool, \n",
    "    type='LossFunctionChange', \n",
    "    prettified=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Shap values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap_values = model.get_feature_importance(\n",
    "    data=pool1, \n",
    "    type='ShapValues'\n",
    ")\n",
    "\n",
    "expected_value = shap_values[0,-1]\n",
    "shap_values = shap_values[:,:-1]\n",
    "\n",
    "print(shap_values.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shap\n",
    "\n",
    "shap.initjs()\n",
    "shap.force_plot(expected_value, shap_values[10,:], X.iloc[10,:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More information about shap value usage you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/model_analysis/shap_values_tutorial.ipynb)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Snapshotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# !rm 'catboost_info/snapshot.bkp'\n",
    "\n",
    "model = CatBoostClassifier(\n",
    "    iterations=100,\n",
    "    save_snapshot=True,\n",
    "    snapshot_file='snapshot.bkp',\n",
    "    snapshot_interval=1\n",
    ")\n",
    "\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=10);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Saving the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(iterations=10)\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n",
    "model.save_model('catboost_model.bin')\n",
    "model.save_model('catboost_model.json', format='json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.load_model('catboost_model.bin')\n",
    "print(model.get_params())\n",
    "print(model.learning_rate_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bonus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solving MultiClassification problem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(loss_function='MultiClass', iterations=50)\n",
    "\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameter tunning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tunned_model = CatBoostClassifier(\n",
    "    iterations=1000,\n",
    "    learning_rate=0.03,\n",
    "    l2_leaf_reg=3,\n",
    "    boosting_type='Plain',\n",
    "    bootstrap_type='Bernoulli',\n",
    "    subsample=0.5,\n",
    "    rsm=0.5,\n",
    "    random_strength=1,\n",
    "    one_hot_max_size=2,\n",
    "    leaf_estimation_method='Newton',\n",
    "    leaf_estimation_iterations=5,\n",
    "    max_ctr_complexity=1,\n",
    ")\n",
    "\n",
    "tunned_model.fit(\n",
    "    X_train, y_train,\n",
    "    cat_features=cat_features,\n",
    "    verbose=False,\n",
    "    eval_set=(X_validation, y_validation),\n",
    "    plot=True\n",
    ");"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "widgets": {
   "state": {
    "1057714ebc614324aa3ba2cf69408966": {
     "views": [
      {
       "cell_index": 17
      }
     ]
    },
    "8381e9eed05f4a03905ae8a56d7ab4ea": {
     "views": [
      {
       "cell_index": 48
      }
     ]
    },
    "f49684e8c5c44241bfe2c7f577f5cb41": {
     "views": [
      {
       "cell_index": 53
      }
     ]
    }
   },
   "version": "1.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
